{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get scRNA-seq TPM"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### AP et al. (GEO series GSE57872, GBM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MGH264_A01</th>\n",
       "      <th>MGH264_A02</th>\n",
       "      <th>MGH264_A03</th>\n",
       "      <th>MGH264_A04</th>\n",
       "      <th>MGH264_A05</th>\n",
       "      <th>MGH264_A06</th>\n",
       "      <th>MGH264_A07</th>\n",
       "      <th>MGH264_A08</th>\n",
       "      <th>MGH264_A10</th>\n",
       "      <th>MGH264_A11</th>\n",
       "      <th>...</th>\n",
       "      <th>MGH26FCS</th>\n",
       "      <th>MGH26Tumor</th>\n",
       "      <th>MGH28CSC</th>\n",
       "      <th>MGH28FCS</th>\n",
       "      <th>MGH28Tumor</th>\n",
       "      <th>MGH29Tumor</th>\n",
       "      <th>MGH30Tumor</th>\n",
       "      <th>MGH31CSC</th>\n",
       "      <th>MGH31FCS</th>\n",
       "      <th>MGH31Tumor</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A2M</th>\n",
       "      <td>-3.801470</td>\n",
       "      <td>-3.801470</td>\n",
       "      <td>-3.801470</td>\n",
       "      <td>-3.801470</td>\n",
       "      <td>-3.801470</td>\n",
       "      <td>-1.987700</td>\n",
       "      <td>-3.801470</td>\n",
       "      <td>-3.801470</td>\n",
       "      <td>-3.801470</td>\n",
       "      <td>-3.801470</td>\n",
       "      <td>...</td>\n",
       "      <td>-6.484534</td>\n",
       "      <td>-3.038651</td>\n",
       "      <td>-3.409382</td>\n",
       "      <td>-3.984825</td>\n",
       "      <td>3.112043</td>\n",
       "      <td>4.161139</td>\n",
       "      <td>1.792081</td>\n",
       "      <td>4.694467</td>\n",
       "      <td>3.826620</td>\n",
       "      <td>2.180888</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAS</th>\n",
       "      <td>-3.889900</td>\n",
       "      <td>-3.889900</td>\n",
       "      <td>-3.889900</td>\n",
       "      <td>-3.889900</td>\n",
       "      <td>3.742495</td>\n",
       "      <td>-3.889900</td>\n",
       "      <td>-3.889900</td>\n",
       "      <td>-3.889900</td>\n",
       "      <td>-3.889900</td>\n",
       "      <td>4.316243</td>\n",
       "      <td>...</td>\n",
       "      <td>-1.392566</td>\n",
       "      <td>-0.183522</td>\n",
       "      <td>0.844202</td>\n",
       "      <td>-0.872499</td>\n",
       "      <td>-0.050085</td>\n",
       "      <td>-0.019764</td>\n",
       "      <td>-0.124966</td>\n",
       "      <td>1.493167</td>\n",
       "      <td>1.468188</td>\n",
       "      <td>-1.114900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAK1</th>\n",
       "      <td>-3.985616</td>\n",
       "      <td>-3.158708</td>\n",
       "      <td>1.733125</td>\n",
       "      <td>-1.665669</td>\n",
       "      <td>-2.166992</td>\n",
       "      <td>4.691156</td>\n",
       "      <td>2.656469</td>\n",
       "      <td>-1.914759</td>\n",
       "      <td>0.483560</td>\n",
       "      <td>1.828663</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.570789</td>\n",
       "      <td>-0.080933</td>\n",
       "      <td>3.096310</td>\n",
       "      <td>0.273106</td>\n",
       "      <td>-0.307181</td>\n",
       "      <td>0.200401</td>\n",
       "      <td>-0.327583</td>\n",
       "      <td>-0.265878</td>\n",
       "      <td>0.887639</td>\n",
       "      <td>-1.929420</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAMP</th>\n",
       "      <td>2.651558</td>\n",
       "      <td>2.358992</td>\n",
       "      <td>-5.820241</td>\n",
       "      <td>3.514271</td>\n",
       "      <td>-5.820241</td>\n",
       "      <td>-4.006471</td>\n",
       "      <td>2.207608</td>\n",
       "      <td>2.417820</td>\n",
       "      <td>-5.820241</td>\n",
       "      <td>2.140173</td>\n",
       "      <td>...</td>\n",
       "      <td>-2.243000</td>\n",
       "      <td>-0.121189</td>\n",
       "      <td>-0.131112</td>\n",
       "      <td>-0.544799</td>\n",
       "      <td>-0.171306</td>\n",
       "      <td>0.866269</td>\n",
       "      <td>0.227523</td>\n",
       "      <td>0.991496</td>\n",
       "      <td>1.506706</td>\n",
       "      <td>1.015800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AARS</th>\n",
       "      <td>2.170748</td>\n",
       "      <td>-6.041792</td>\n",
       "      <td>-6.041792</td>\n",
       "      <td>-6.041792</td>\n",
       "      <td>2.094729</td>\n",
       "      <td>-3.449348</td>\n",
       "      <td>-6.041792</td>\n",
       "      <td>3.162904</td>\n",
       "      <td>-6.041792</td>\n",
       "      <td>-6.041792</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.706095</td>\n",
       "      <td>-0.325803</td>\n",
       "      <td>0.075450</td>\n",
       "      <td>-1.467605</td>\n",
       "      <td>0.563490</td>\n",
       "      <td>-0.364656</td>\n",
       "      <td>-0.628892</td>\n",
       "      <td>-0.016801</td>\n",
       "      <td>0.103757</td>\n",
       "      <td>3.180410</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 543 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      MGH264_A01  MGH264_A02  MGH264_A03  MGH264_A04  MGH264_A05  MGH264_A06  \\\n",
       "A2M    -3.801470   -3.801470   -3.801470   -3.801470   -3.801470   -1.987700   \n",
       "AAAS   -3.889900   -3.889900   -3.889900   -3.889900    3.742495   -3.889900   \n",
       "AAK1   -3.985616   -3.158708    1.733125   -1.665669   -2.166992    4.691156   \n",
       "AAMP    2.651558    2.358992   -5.820241    3.514271   -5.820241   -4.006471   \n",
       "AARS    2.170748   -6.041792   -6.041792   -6.041792    2.094729   -3.449348   \n",
       "\n",
       "      MGH264_A07  MGH264_A08  MGH264_A10  MGH264_A11  ...  MGH26FCS  \\\n",
       "A2M    -3.801470   -3.801470   -3.801470   -3.801470  ... -6.484534   \n",
       "AAAS   -3.889900   -3.889900   -3.889900    4.316243  ... -1.392566   \n",
       "AAK1    2.656469   -1.914759    0.483560    1.828663  ... -0.570789   \n",
       "AAMP    2.207608    2.417820   -5.820241    2.140173  ... -2.243000   \n",
       "AARS   -6.041792    3.162904   -6.041792   -6.041792  ... -0.706095   \n",
       "\n",
       "      MGH26Tumor  MGH28CSC  MGH28FCS  MGH28Tumor  MGH29Tumor  MGH30Tumor  \\\n",
       "A2M    -3.038651 -3.409382 -3.984825    3.112043    4.161139    1.792081   \n",
       "AAAS   -0.183522  0.844202 -0.872499   -0.050085   -0.019764   -0.124966   \n",
       "AAK1   -0.080933  3.096310  0.273106   -0.307181    0.200401   -0.327583   \n",
       "AAMP   -0.121189 -0.131112 -0.544799   -0.171306    0.866269    0.227523   \n",
       "AARS   -0.325803  0.075450 -1.467605    0.563490   -0.364656   -0.628892   \n",
       "\n",
       "      MGH31CSC  MGH31FCS  MGH31Tumor  \n",
       "A2M   4.694467  3.826620    2.180888  \n",
       "AAAS  1.493167  1.468188   -1.114900  \n",
       "AAK1 -0.265878  0.887639   -1.929420  \n",
       "AAMP  0.991496  1.506706    1.015800  \n",
       "AARS -0.016801  0.103757    3.180410  \n",
       "\n",
       "[5 rows x 543 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "AP_df = pd.read_csv('../data/external_scRNAseq/GSE57872_GBM_data_matrix.txt', sep='\\t', index_col=0)\n",
    "\n",
    "AP_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "532"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cell_name_list = [c for c in AP_df.columns if '_' in c]\n",
    "sample_cell_pair = [[c] + [c.split('_')[0]] for c in cell_name_list]\n",
    "\n",
    "len(cell_name_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cell_id</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_id</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CSC6</th>\n",
       "      <td>44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CSC8</th>\n",
       "      <td>58</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MGH26</th>\n",
       "      <td>53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MGH264</th>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MGH28</th>\n",
       "      <td>94</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MGH29</th>\n",
       "      <td>75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MGH30</th>\n",
       "      <td>73</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MGH31</th>\n",
       "      <td>70</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           cell_id\n",
       "sample_id         \n",
       "CSC6            44\n",
       "CSC8            58\n",
       "MGH26           53\n",
       "MGH264          65\n",
       "MGH28           94\n",
       "MGH29           75\n",
       "MGH30           73\n",
       "MGH31           70"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "AP_cell_meta_df = pd.DataFrame(sample_cell_pair, columns=['cell_id', 'sample_id'])\n",
    "AP_cell_meta_df = AP_cell_meta_df.set_index('cell_id')\n",
    "AP_cell_meta_df.reset_index().groupby('sample_id').count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "AP_tpm_df = np.power(2, AP_df[cell_name_list])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((5948,), 5948)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "AP_tpm_df.index.shape, len(set(AP_tpm_df.index))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### SP et al."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/miniconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3051: DtypeWarning: Columns (1,2,4,8,11,13,14,15,16,17,19,25,27,28,29,32,33,34,36,39,40,41,42,43,48,49,51,52,54,56,59,60,61,64,65,68,69,70,74,77,78,79,80,82,83,84,87,88,90,91,93,94,96,99,100,102,103,104,106,110,112,113,116,122,123,124,125,127,128,131,132,133,134,139,142,143,144,148,149,151,152,153,155,156,160,163,169,170,171,172,175,177,179,180,181,182,183,184,185,186,190,192,193,194,195,196,197,198,203,204,205,206,208,210,211,213,215,217,218,220,221,224,227,229,230,231,233,234,237,238,239,241,242,243,250,251,253,255,257,258,261,266,268,269,272,273,275,276,277,279,280,281,282,283,284,286,289,293,294,296,297,302,304,308,309,310,311,312,315,316,323,325,329,330,331,332,335,337,339,341,342,343,344,346,347,348,352,356,359,360,361,363,365,368,369,374,375,377,379,382,384,386,388,390,392,393,394,395,396,399,400,401,403,406,407,408,409,411,412,414,415,417,418,420,422,423,425,427,428,429,430,431,433,435,436,438,439,444,446,448,450,451,452,454,455,456,457,458,459,461,466,467,468,470,475,479,484,485,486,487,488,489,491,492,493,494,495,499,501,504,505,507,508,510,512,515,516,518,521,522,523,524,525,526,528,530,535,537,538,542,543,545,547,549,550,551,552,553,555,557,558,559,561,562,565,566,567,568,569,572,574,576,577,578,579,583,584,585,586,587,588,591,592,593,594,600,601,606,607,608,609,612,613,614,615,617,618,619,620,621,622,623,625,626,628,629,631,632,633,634,637,638,639,641,642,643,644,645,646,647,648,649,650,651,652,653,654,655,656,657,658,659,660,661,662,663,665,666,667,670,671,673,675,676,677,678,679,680,681,682,683,684,685,686,688,689,690,691,692,693,694,695,696,697,698,699,700,703,704,705,706,708,709,710,711,712,716,717,718,721,722,723,724,725,726,727,728,729,730,732,733,734,735,736,737,739,740,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,760,761,762,763,764,765,766,767,768,769,770,771,772,774,776,777,778,779,780,781,782,783,784,785,786,789,790,791,792,793,7
95,796,797,799,800,801,802,803,804,805,808,809,810,811,812,815,816,817,818,819,820,821,822,824,825,826,827,828,829,830,832,833,834,835,836,838,839,840,841,843,844,845,848,849,850,851,852,854,855,857,858,861,862,863,864,865,866,867,869,870,871,872,873,874,876,877,878,879,880,881,882,883,885,886,887,888,889,890,891,892,893,894,895,896,897,898,899,900,901,902,903,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,928,929,930,931,934,935,936,937,941,942,945,946,947,948,949,954,955,956,958,959,960,961,963,966,968,969,972,973,975,977,981,982,983,984,985,986,988,990,991,992,995,997,999,1000,1002,1003,1004,1005,1006,1007,1009,1011,1013,1016,1018,1019,1021,1022,1023,1024,1027,1028,1032,1034,1036,1037,1039,1041,1042,1043,1044,1045,1046,1050,1051,1056,1057,1061,1063,1064,1065,1070,1071,1074,1077,1084,1098,1099,1100,1102,1107,1109,1110,1113,1114,1115,1117,1118,1119,1120,1121,1122,1123,1124,1125,1127,1128,1129,1131,1132,1133,1134,1135,1137,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,1148,1149,1150,1151,1152,1155,1156,1157,1158,1159,1160,1161,1162,1163,1164,1165,1167,1168,1172,1173,1174,1176,1177,1178,1180,1181,1182,1183,1184,1185,1186,1187,1188,1189,1190,1191,1192,1193,1194,1195,1196,1197,1198,1199,1200,1201,1202,1203,1204,1205,1206,1208,1209,1210,1211,1212,1213,1214,1215,1216,1217,1219,1220,1221,1222,1225,1226,1227,1229,1230,1231,1233,1234,1235,1236,1237,1238,1239,1241,1242,1243,1244,1245,1246,1247,1248,1249,1252,1253,1254,1255,1256,1257,1258,1259,1260,1262,1263,1264,1265,1266,1267,1268,1269,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279,1280,1282,1283,1285,1286,1287,1288,1289,1290,1291,1292,1293,1294,1295,1296,1298,1299,1300,1301,1302,1304,1305,1306,1307,1309,1311,1312,1314,1317,1318,1319,1320,1321,1323,1324,1325,1326,1327,1328,1329,1330,1331,1332,1333,1334,1335,1337,1338,1339,1340,1341,1342,1343,1344,1345,1346,1348,1349,1350,1352,1353,1354,1355,1356,1357,1358,1360,1361,1362,1363,1364,1365,1366,1367,1368,1369,1370,1372,1373,1375,1376,1377,1379
,1388,1391,1395,1396,1403,1405,1407,1422,1425,1437,1442,1450,1458,1472,1475,1492,1506,1514,1518,1520,1523,1524,1525,1526,1527,1528,1529,1530,1531,1535,1538,1541,1545,1546,1547,1548,1550,1551,1552,1553,1554,1557,1560,1561,1562,1563,1565,1567,1568,1569,1570,1572,1574,1577,1579,1580,1581,1586,1588,1589,1590,1592,1593,1594,1595,1596,1600,1601,1603,1605,1606,1609,1614,1617,1618,1619,1620,1621,1622,1623,1624,1625,1627,1628,1629,1631,1632,1633,1634,1635,1636,1637,1638,1639,1640,1641,1643,1644,1645,1647,1650,1654,1655,1656,1657,1658,1659,1660,1661,1663,1664,1665,1667,1669,1670,1672,1673,1675,1676,1677,1679,1680,1681,1682,1683,1685,1688,1689,1690,1691,1695,1697,1698,1700,1701,1702,1703,1704,1705,1708,1709,1710,1712,1713,1717,1719,1720,1721,1723,1724,1725,1726,1729,1730,1731,1732,1733,1734,1735,1736,1737,1738,1739,1740,1741,1742,1744,1745,1746,1749,1750,1752,1753,1754,1755,1758,1759,1760,1761,1762,1765,1766,1767,1768,1769,1771,1772,1774,1775,1776,1778,1779,1780,1781,1782,1783,1784,1785,1786,1787,1788,1789,1790,1792,1794,1795,1796,1798,1799,1800,1801,1803,1805,1806,1807,1810,1811,1812,1813,1815,1816,1817,1819,1822,1823,1824,1825,1826,1827,1828,1830,1832,1834,1838,1839,1840,1841,1842,1843,1845,1846,1847,1848,1849,1850,1852,1855,1856,1857,1858,1860,1862,1863,1865,1866,1867,1868,1869,1870,1871,1872,1873,1876,1878,1880,1881,1883,1884,1885,1887,1892,1893,1899,1900,1902,1903,1904,1913,1916,1919,1920,1921,1925,1938,1939,1940,1947,1949,1950,1954,1959,1964,1966,1968,1971,1973,1974,1975,1976,1977,1980,1981,1982,1983,1986,1988,1989,1991,1994,1995,1996,1999,2000,2001,2003,2004,2005,2006,2007,2009,2011,2012,2013,2014,2015,2016,2022,2023,2024,2025,2026,2028,2030,2031,2032,2033,2034,2035,2036,2038,2040,2041,2042,2043,2044,2045,2047,2049,2052,2053,2056,2057,2060,2061,2062,2063,2064,2066,2067,2069,2070,2072,2073,2074,2075,2076,2077,2080,2081,2082,2083,2084,2086,2087,2088,2089,2091,2092,2093,2094,2096,2100,2101,2105,2107,2108,2112,2113,2114,2115,2116,2118,2120,2123,2124,2126,2127,2128,2130,2132
,2133,2135,2139,2140,2141,2142,2144,2145,2146,2149,2150,2152,2154,2155,2156,2159,2162,2164,2165,2166,2168,2169,2170,2171,2172,2176,2178,2179,2182,2184,2189,2192,2193,2194,2195,2197,2200,2201,2202,2203,2206,2209,2211,2212,2224,2245,2270,2279,2295,2308,2318,2326,2348,2351,2352,2375,2481,2525,2731,2734,2736,2738,2739,2741,2742,2745,2747,2748,2749,2750,2752,2753,2754,2755,2756,2757,2759,2760,2761,2762,2763,2764,2765,2767,2769,2770,2771,2772,2773,2775,2776,2778,2779,2780,2782,2785,2786,2787,2793,2796,2797,2799,2800,2801,2802,2803,2804,2811,2812,2813,2815,2816,2822,2824,2825,2827,2828,2829,2831,2832,2833,2834,2837,2839,2841,2842,2843,2846,2847,2848,2849,2850,2851,2852,2854,2857,2858,2859,2860,2862,2863,2866,2867,2868,2869,2871,2872,2874,2875,2878,2879,2880,2882,2883,2884,2886,2887,2888,2889,2890,2891,2893,2894,2896,2898,2899,2900,2901,2902,2904,2905,2907,2910,2912,2913,2914,2915,2916,2917,2918,2922,2923,2924,2925,2926,2928,2929,2930,2931,2932,2933,2935,2936,2938,2940,2941,2942,2943,2944,2945,2947,2949,2950,2951,2953,2954,2956,2957,2958,2959,2961,2962,2964,2965,2966,2967,2968,2969,2970,2971,2972,2973,2974,2976,2977,2978,2981,2982,2983,2984,2986,2987,2988,2990,2991,2993,2995,2996,2997,2998,2999,3000,3001,3002,3003,3004,3005,3009,3010,3011,3016,3017,3018,3019,3021,3022,3023,3026,3028,3029,3032,3033,3035,3036,3037,3038,3041,3042,3043,3044,3046,3047,3048,3049,3051,3052,3053,3054,3055,3057,3059,3060,3061,3063,3068,3069,3070,3071,3072,3074,3077,3078,3080,3081,3082,3084,3085,3088,3089,3090,3092,3093,3094,3095,3097,3098,3099,3100,3101,3102,3103,3104,3105,3107,3110,3111,3112,3113,3115,3118,3119,3120,3122,3124,3125,3127,3128,3129,3130,3131,3133,3134,3138,3139,3142,3146,3147,3148,3150,3151,3152,3154,3155,3156,3157,3159,3162,3164,3165,3166,3167,3168,3170,3171,3172,3173,3175,3176,3179,3180,3181,3183,3187,3190,3193,3195,3196,3198,3199,3200,3201,3202,3205,3206,3207,3209,3210,3211,3213,3217,3218,3219,3220,3221,3222,3228,3229,3230,3231,3233,3234,3236,3237,3238,3239,3240,3242,3243,3244,3246
,3247,3248,3249,3251,3252,3253,3254,3255,3256,3257,3258,3259,3263,3264,3266,3267,3268,3271,3272,3273,3274,3276,3277,3278,3279,3281,3282,3283,3285,3286,3288,3289,3290,3291,3292,3293,3294,3295,3296,3299,3300,3301,3302,3303,3304,3306,3308,3309,3310,3311,3312,3313,3315,3316,3319,3320,3321,3322,3323,3325,3326,3328,3330,3333,3335,3336,3337,3338,3344,3345,3346,3347,3348,3349,3350,3351,3352,3353,3354,3355,3356,3358,3359,3360,3362,3364,3365,3366,3367,3368,3369,3373,3375,3376,3378,3380,3382,3383,3384,3385,3390,3391,3392,3393,3394,3396,3398,3399,3400,3402,3403,3405,3406,3407,3409,3410,3411,3412,3413,3415,3416,3417,3420,3421,3422,3425,3426,3435,3436,3437,3438,3441,3442,3443,3446,3447,3448,3449,3450,3451,3453,3456,3459,3461,3462,3466,3467,3469,3470,3472,3475,3476,3479,3481,3482,3484,3485,3486,3488,3490,3491,3493,3495,3496,3497,3500,3502,3503,3506,3508,3509,3511,3512,3513,3514,3515,3516,3517,3518,3523,3524,3526,3527,3528,3530,3531,3532,3535,3536,3537,3538,3539,3540,3542,3543,3544,3545,3547,3549,3551,3552,3553,3555,3556,3557,3559,3561,3562,3564,3565,3566,3567,3568,3570,3571,3572,3573,3575,3576,3579,3580,3581,3585,3589,3590,3591,3592,3593,3594,3596,3597,3599,3602,3603,3606,3607,3609,3611,3612,3614,3616,3617,3618,3621,3623,3624,3625,3626,3627,3628,3629,3632,3634,3635,3636,3637,3638,3640,3642,3644,3645,3649,3650,3651,3652,3655,3656,3658,3660,3661,3663,3664,3665,3667,3668,3669,3670,3671,3673,3674,3675,3676,3677,3678,3680,3681,3682,3683,3684,3685,3687,3692,3693,3695,3696,3697,3698,3699,3701,3702,3703,3705,3706,3707,3708,3710,3712,3726,3769,3801,3802,3803,3804,3805,3806,3807,3808,3809,3810,3811,3812,3813,3814,3815,3816,3817,3818,3819,3820,3821,3822,3823,3824,3825,3826,3827,3828,3829,3830,3831,3832,3833,3834,3835,3836,3837,3838,3839,3840,3841,3842,3843,3844,3845,3846,3847,3848,3849,3850,3851,3852,3853,3854,3855,3856,3857,3858,3859,3860,3861,3862,3863,3864,3865,3866,3867,3868,3869,3870,3871,3872,3873,3874,3875,3876,3877,3878,3879,3880,3881,3882,3883,3884,3885,3886,3887,3888,3889,3890,3891
,3892,3893,3894,3895,3896,3897,3898,3899,3900,3901,3902,3903,3904,3905,3906,3907,3908,3909,3910,3911,3912,3913,3914,3915,3917,3918,3919,3920,3921,3922,3946,3953,3978,3995,3998,3999,4000,4002,4003,4004,4005,4007,4008,4009,4011,4016,4017,4018,4019,4021,4023,4025,4028,4029,4030,4032,4033,4036,4037,4039,4040,4041,4042,4044,4045,4046,4047,4048,4051,4052,4053,4057,4058,4059,4060,4061,4062,4063,4064,4065,4066,4067,4068,4069,4070,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080,4081,4082,4083,4084,4085,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095,4096,4097,4098,4099,4100,4101,4102,4103,4104,4105,4106,4107,4108,4109,4110,4111,4112,4113,4114,4115,4116,4117,4118,4119,4120,4121,4122,4123,4124,4125,4126,4127,4128,4129,4130,4131,4132,4133,4136,4137,4138,4139,4142,4143,4144,4145,4146,4147,4149,4151,4152,4153,4156,4157,4158,4159,4162,4163,4165,4167,4168,4169,4170,4172,4173,4174,4175,4176,4177,4179,4183,4187,4189,4191,4193,4195,4196,4197,4198,4200,4201,4204,4208,4210,4213,4214,4218,4220,4221,4225,4226,4228,4229,4231,4235,4237,4238,4242,4243,4244,4246,4247,4248,4249,4250,4252,4253,4254,4255,4256,4258,4260,4261,4262,4263,4264,4265,4266,4267,4268,4269,4270,4271,4272,4273,4274,4275,4276,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,4300,4301,4302,4303,4304,4305,4306,4307,4308,4309,4310,4311,4312,4313,4314,4315,4316,4317,4318,4319,4320,4321,4322,4323,4324,4325,4326,4327,4328,4329,4330,4331,4332,4333,4334,4335,4336,4337,4338,4339,4340,4341,4342,4343,4344,4345,4346,4347,4348,4349,4350,4351,4352,4353,4354,4355,4356,4357,4358,4359,4360,4361,4362,4363,4364,4365,4366,4367,4368,4369,4370,4371,4372,4373,4374,4375,4376,4377,4378,4379,4380,4381,4382,4383,4384,4385,4386,4387,4388,4389,4390,4391,4392,4393,4394,4395,4396,4397,4398,4399,4400,4401,4402,4403,4404,4405,4406,4407,4408,4409,4410,4411,4412,4413,4414,4415,4416,4417,4418,4419,4420,4423,4424,4426,4427,4429,4431,4432,4435,4436,4437,4439,4440,4441,4442,4443,4444,4445,4446
,4447,4448,4449,4450,4452,4454,4456,4457,4460,4461,4462,4467,4468,4469,4470,4472,4473,4474,4475,4476,4477,4478,4479,4482,4483,4484,4485,4486,4487,4488,4489,4490,4491,4493,4494,4496,4497,4498,4499,4502,4503,4504,4505,4507,4508,4509,4510,4511,4512,4513,4514,4515,4516,4518,4519,4523,4525,4526,4527,4528,4529,4530,4531,4532,4533,4534,4535,4536,4537,4538,4539,4540,4541,4542,4543,4544,4545,4546,4547,4548,4549,4550,4551,4552,4553,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4572,4573,4574,4575,4576,4577,4578,4579,4580,4581,4582,4583,4584,4585,4586,4587,4588,4589,4590,4591,4592,4593,4594,4595,4596,4597,4598,4599,4600,4601,4602,4603,4604,4605,4606,4607,4608,4609,4610,4611,4612,4613,4614,4615,4616,4617,4618,4619,4620,4621,4622,4623,4624,4625,4626,4627,4628,4629,4630,4631,4632,4633,4634,4635,4636,4637,4638,4640,4641,4642,4643,4644,4645,4646,4647,4648,4649,4650,4651,4652,4653,4654,4655,4656,4657,4658,4659,4660,4661,4662,4663,4664,4665,4666,4667,4668,4669,4670,4671,4672,4673,4674,4675,4677,4678,4680,4681,4683,4685,4688,4692,4693,4694,4695,4701,4703,4704,4705,4712,4715,4717,4718,4720,4722,4724,4725,4727,4728,4730,4735,4736,4737,4738,4739,4740,4741,4742,4743,4744,4745,4746,4747,4748,4749,4750,4751,4752,4753,4754,4755,4756,4757,4758,4759,4760,4761,4762,4763,4764,4765,4766,4767,4768,4769,4771,4772,4773,4774,4776,4777,4778,4779,4780,4781,4782,4783,4784,4794,4800,4801,4802,4803,4804,4805,4806,4807,4808,4809,4810,4811,4812,4813,4814,4815,4816,4817,4818,4819,4820,4821,4822,4823,4824,4825,4826,4827,4828,4829,4830,4831,4832,4833,4834,4835,4836,4837,4838,4839,4840,4841,4842,4843,4844,4845,4847,4849,4850,4851,4853,4854,4859,4861,4862,4863,4865,4866,4867,4868,4869,4870,4872,4873,4874,4875,4876,4877,4878,4880,4881,4882,4884,4885,4886,4887,4888,4889,4890,4891,4892,4893,4894,4896,4897,4898,4899,4900,4901,4902,4903,4904,4906,4907,4908,4909,4910,4911,4912,4913,4914,4915,4916,4917,4918,4919,4920,4921,4922,4923,4925,4926,4927,4929,4930,4931,4932,4936,4937,4939
,4940,4941,4942,4943,4944,4945,4948,4978,4979,4980,4981,4982,4983,4984,4985,4986,4987,4988,4989,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999,5000,5001,5002,5003,5004,5005,5006,5007,5008,5009,5010,5011,5012,5013,5014,5015,5016,5017,5018,5019,5020,5021,5022,5023,5024,5025,5026,5027,5028,5029,5030,5031,5032,5033,5035,5042,5043,5048,5050,5054,5056,5057,5058,5067,5068,5072,5075,5080,5081,5082,5084,5086,5095,5097,5098,5103,5108,5109,5110,5112,5116,5119,5121,5123,5124,5126,5129,5135,5136,5138,5139,5142,5143,5144,5146,5147,5150,5151,5152,5154,5155,5157,5158,5160,5161,5162,5163,5164,5166,5167,5168,5169,5172,5173,5174,5175,5176,5177,5179,5181,5183,5184,5185,5187,5188,5191,5193,5194,5195,5196,5197,5198,5199,5200,5201,5202,5203,5204,5205,5206,5207,5208,5209,5210,5211,5212,5213,5214,5215,5216,5217,5218,5219,5220,5221,5223,5224,5225,5226,5227,5228,5229,5230,5231,5232,5233,5234,5235,5236,5237,5238,5239,5240,5241,5242,5244,5245,5246,5247,5248,5249,5250,5251,5252,5253,5254,5255,5256,5257,5259,5260,5261,5262,5263,5264,5265,5266,5267,5268,5269,5270,5271,5272,5273,5274,5275,5276,5277,5278,5279,5280,5281,5282,5283,5284,5285,5286,5287,5288,5289,5290,5291,5292,5293,5294,5295,5296,5298,5302,5305,5310,5311,5312,5314,5316,5317,5318,5319,5325,5327,5331,5332,5335,5337,5338,5341,5342,5343,5344,5345,5346,5347,5349,5350,5351,5354,5355,5358,5359,5360,5361,5362,5363,5364,5365,5366,5367,5368,5369,5370,5371,5372,5374,5375,5376,5380,5381,5382,5385,5386,5388,5389,5392,5393,5394,5396,5398,5399,5402,5409,5413,5414,5415,5418,5421,5423,5425,5427,5428,5431,5433,5439,5440,5441,5443,5444,5446,5447,5449,5450,5451,5454,5456,5462,5464,5465,5466,5468,5469,5470,5471,5472,5474,5477,5478,5479,5480,5483,5484,5487,5488,5489,5491,5492,5493,5495,5496,5501,5503,5508,5510,5511,5512,5513,5514,5515,5516,5517,5518,5519,5520,5521,5522,5523,5524,5525,5526,5527,5528,5530,5531,5532,5533,5534,5535,5536,5537,5538,5539,5540,5541,5542,5543,5544,5545,5546,5547,5548,5549,5550,5551,5552,5553,5554,5555,5556,5557,5558,5559,5560,5561
,5562,5563,5564,5565,5566,5567,5568,5569,5571,5572,5573,5574,5575,5576,5577,5579,5581,5583,5586,5587,5589,5595,5603,5604,5607,5610,5611,5615,5616,5617,5618,5620,5623,5626,5631,5633,5635,5636,5637,5638,5639,5641,5642,5643,5645,5648,5739,5754) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    }
   ],
   "source": [
    "SP_df = pd.read_csv('../data/external_scRNAseq/GSE103322_HNSCC_all_data.txt', sep='\\t', index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>HN28_P15_D06_S330_comb</th>\n",
       "      <th>HN28_P6_G05_S173_comb</th>\n",
       "      <th>HN26_P14_D11_S239_comb</th>\n",
       "      <th>HN26_P14_H05_S281_comb</th>\n",
       "      <th>HN26_P25_H09_S189_comb</th>\n",
       "      <th>HN26_P14_H06_S282_comb</th>\n",
       "      <th>HN25_P25_C04_S316_comb</th>\n",
       "      <th>HN26_P25_A11_S107_comb</th>\n",
       "      <th>HN26_P25_C09_S129_comb</th>\n",
       "      <th>HNSCC26_P24_H05_S377_comb</th>\n",
       "      <th>...</th>\n",
       "      <th>HNSCC20_P3_B10_S22_comb</th>\n",
       "      <th>HNSCC20_P13_B11_S215_comb</th>\n",
       "      <th>HNSCC20_P3_C08_S32_comb</th>\n",
       "      <th>HNSCC17_P4_H03_S183_comb</th>\n",
       "      <th>HNSCC20_P3_F09_S69_comb</th>\n",
       "      <th>HNSCC17_P4_G12_S180_comb</th>\n",
       "      <th>HNSCC20_P13_C05_S221_comb</th>\n",
       "      <th>HNSCC17_P4_C12_S132_comb</th>\n",
       "      <th>HNSCC20_P3_H08_S92_comb</th>\n",
       "      <th>HNSCC20_P3_G06_S78_comb</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>processed by Maxima enzyme</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>1</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Lymph node</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>1</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>classified  as cancer cell</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>0</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0000</td>\n",
       "      <td>1.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>classified as non-cancer cells</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>1</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>non-cancer cell type</th>\n",
       "      <td>Fibroblast</td>\n",
       "      <td>Fibroblast</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>Fibroblast</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>Fibroblast</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'C9orf152'</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.42761</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'RPS11'</th>\n",
       "      <td>6.0037</td>\n",
       "      <td>7.3006</td>\n",
       "      <td>7.28850</td>\n",
       "      <td>0</td>\n",
       "      <td>7.47420</td>\n",
       "      <td>6.9548</td>\n",
       "      <td>5.9743</td>\n",
       "      <td>6.465</td>\n",
       "      <td>6.5628</td>\n",
       "      <td>7.9486</td>\n",
       "      <td>...</td>\n",
       "      <td>6.8747</td>\n",
       "      <td>5.96490</td>\n",
       "      <td>4.8252</td>\n",
       "      <td>7.5167</td>\n",
       "      <td>7.3249</td>\n",
       "      <td>5.3595</td>\n",
       "      <td>7.4281</td>\n",
       "      <td>6.8439</td>\n",
       "      <td>6.8676</td>\n",
       "      <td>6.3146</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'ELMO2'</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>5.2465</td>\n",
       "      <td>0.50487</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>3.4154</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>1.9613</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.62106</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>3.2863</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>3.5905</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'CREB3L1'</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'PNMA1'</th>\n",
       "      <td>5.1474</td>\n",
       "      <td>5.3329</td>\n",
       "      <td>2.83370</td>\n",
       "      <td>5.7507</td>\n",
       "      <td>0.19661</td>\n",
       "      <td>1.9949</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>2.1541</td>\n",
       "      <td>3.5815</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>...</td>\n",
       "      <td>3.0332</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>5.3462</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10 rows × 5902 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                               HN28_P15_D06_S330_comb HN28_P6_G05_S173_comb  \\\n",
       "processed by Maxima enzyme                          1                     1   \n",
       "Lymph node                                          1                     0   \n",
       "classified  as cancer cell                          0                     0   \n",
       "classified as non-cancer cells                      1                     1   \n",
       "non-cancer cell type                       Fibroblast            Fibroblast   \n",
       "'C9orf152'                                          0                     0   \n",
       "'RPS11'                                        6.0037                7.3006   \n",
       "'ELMO2'                                             0                     0   \n",
       "'CREB3L1'                                           0                     0   \n",
       "'PNMA1'                                        5.1474                5.3329   \n",
       "\n",
       "                                HN26_P14_D11_S239_comb HN26_P14_H05_S281_comb  \\\n",
       "processed by Maxima enzyme                     1.00000                      1   \n",
       "Lymph node                                     1.00000                      1   \n",
       "classified  as cancer cell                     1.00000                      0   \n",
       "classified as non-cancer cells                 0.00000                      1   \n",
       "non-cancer cell type                           0.00000             Fibroblast   \n",
       "'C9orf152'                                     0.42761                      0   \n",
       "'RPS11'                                        7.28850                      0   \n",
       "'ELMO2'                                        0.00000                 5.2465   \n",
       "'CREB3L1'                                      0.00000                      0   \n",
       "'PNMA1'                                        2.83370                 5.7507   \n",
       "\n",
       "                                HN26_P25_H09_S189_comb  \\\n",
       "processed by Maxima enzyme                     1.00000   \n",
       "Lymph node                                     1.00000   \n",
       "classified  as cancer cell                     1.00000   \n",
       "classified as non-cancer cells                 0.00000   \n",
       "non-cancer cell type                           0.00000   \n",
       "'C9orf152'                                     0.00000   \n",
       "'RPS11'                                        7.47420   \n",
       "'ELMO2'                                        0.50487   \n",
       "'CREB3L1'                                      0.00000   \n",
       "'PNMA1'                                        0.19661   \n",
       "\n",
       "                                HN26_P14_H06_S282_comb  \\\n",
       "processed by Maxima enzyme                      1.0000   \n",
       "Lymph node                                      1.0000   \n",
       "classified  as cancer cell                      1.0000   \n",
       "classified as non-cancer cells                  0.0000   \n",
       "non-cancer cell type                            0.0000   \n",
       "'C9orf152'                                      0.0000   \n",
       "'RPS11'                                         6.9548   \n",
       "'ELMO2'                                         0.0000   \n",
       "'CREB3L1'                                       0.0000   \n",
       "'PNMA1'                                         1.9949   \n",
       "\n",
       "                                HN25_P25_C04_S316_comb HN26_P25_A11_S107_comb  \\\n",
       "processed by Maxima enzyme                      1.0000                      1   \n",
       "Lymph node                                      1.0000                      1   \n",
       "classified  as cancer cell                      1.0000                      0   \n",
       "classified as non-cancer cells                  0.0000                      1   \n",
       "non-cancer cell type                            0.0000             Fibroblast   \n",
       "'C9orf152'                                      0.0000                      0   \n",
       "'RPS11'                                         5.9743                  6.465   \n",
       "'ELMO2'                                         0.0000                 3.4154   \n",
       "'CREB3L1'                                       0.0000                      0   \n",
       "'PNMA1'                                         0.0000                 2.1541   \n",
       "\n",
       "                                HN26_P25_C09_S129_comb  \\\n",
       "processed by Maxima enzyme                      1.0000   \n",
       "Lymph node                                      1.0000   \n",
       "classified  as cancer cell                      1.0000   \n",
       "classified as non-cancer cells                  0.0000   \n",
       "non-cancer cell type                            0.0000   \n",
       "'C9orf152'                                      0.0000   \n",
       "'RPS11'                                         6.5628   \n",
       "'ELMO2'                                         0.0000   \n",
       "'CREB3L1'                                       0.0000   \n",
       "'PNMA1'                                         3.5815   \n",
       "\n",
       "                                HNSCC26_P24_H05_S377_comb  ...  \\\n",
       "processed by Maxima enzyme                         1.0000  ...   \n",
       "Lymph node                                         1.0000  ...   \n",
       "classified  as cancer cell                         1.0000  ...   \n",
       "classified as non-cancer cells                     0.0000  ...   \n",
       "non-cancer cell type                               0.0000  ...   \n",
       "'C9orf152'                                         0.0000  ...   \n",
       "'RPS11'                                            7.9486  ...   \n",
       "'ELMO2'                                            1.9613  ...   \n",
       "'CREB3L1'                                          0.0000  ...   \n",
       "'PNMA1'                                            0.0000  ...   \n",
       "\n",
       "                               HNSCC20_P3_B10_S22_comb  \\\n",
       "processed by Maxima enzyme                      0.0000   \n",
       "Lymph node                                      0.0000   \n",
       "classified  as cancer cell                      1.0000   \n",
       "classified as non-cancer cells                  0.0000   \n",
       "non-cancer cell type                            0.0000   \n",
       "'C9orf152'                                      0.0000   \n",
       "'RPS11'                                         6.8747   \n",
       "'ELMO2'                                         0.0000   \n",
       "'CREB3L1'                                       0.0000   \n",
       "'PNMA1'                                         3.0332   \n",
       "\n",
       "                                HNSCC20_P13_B11_S215_comb  \\\n",
       "processed by Maxima enzyme                        0.00000   \n",
       "Lymph node                                        1.00000   \n",
       "classified  as cancer cell                        1.00000   \n",
       "classified as non-cancer cells                    0.00000   \n",
       "non-cancer cell type                              0.00000   \n",
       "'C9orf152'                                        0.00000   \n",
       "'RPS11'                                           5.96490   \n",
       "'ELMO2'                                           0.62106   \n",
       "'CREB3L1'                                         0.00000   \n",
       "'PNMA1'                                           0.00000   \n",
       "\n",
       "                               HNSCC20_P3_C08_S32_comb  \\\n",
       "processed by Maxima enzyme                      0.0000   \n",
       "Lymph node                                      0.0000   \n",
       "classified  as cancer cell                      1.0000   \n",
       "classified as non-cancer cells                  0.0000   \n",
       "non-cancer cell type                            0.0000   \n",
       "'C9orf152'                                      0.0000   \n",
       "'RPS11'                                         4.8252   \n",
       "'ELMO2'                                         0.0000   \n",
       "'CREB3L1'                                       0.0000   \n",
       "'PNMA1'                                         0.0000   \n",
       "\n",
       "                               HNSCC17_P4_H03_S183_comb  \\\n",
       "processed by Maxima enzyme                       0.0000   \n",
       "Lymph node                                       0.0000   \n",
       "classified  as cancer cell                       1.0000   \n",
       "classified as non-cancer cells                   0.0000   \n",
       "non-cancer cell type                             0.0000   \n",
       "'C9orf152'                                       0.0000   \n",
       "'RPS11'                                          7.5167   \n",
       "'ELMO2'                                          0.0000   \n",
       "'CREB3L1'                                        0.0000   \n",
       "'PNMA1'                                          0.0000   \n",
       "\n",
       "                               HNSCC20_P3_F09_S69_comb  \\\n",
       "processed by Maxima enzyme                      0.0000   \n",
       "Lymph node                                      0.0000   \n",
       "classified  as cancer cell                      1.0000   \n",
       "classified as non-cancer cells                  0.0000   \n",
       "non-cancer cell type                            0.0000   \n",
       "'C9orf152'                                      0.0000   \n",
       "'RPS11'                                         7.3249   \n",
       "'ELMO2'                                         0.0000   \n",
       "'CREB3L1'                                       0.0000   \n",
       "'PNMA1'                                         0.0000   \n",
       "\n",
       "                               HNSCC17_P4_G12_S180_comb  \\\n",
       "processed by Maxima enzyme                       0.0000   \n",
       "Lymph node                                       0.0000   \n",
       "classified  as cancer cell                       0.0000   \n",
       "classified as non-cancer cells                   0.0000   \n",
       "non-cancer cell type                             0.0000   \n",
       "'C9orf152'                                       0.0000   \n",
       "'RPS11'                                          5.3595   \n",
       "'ELMO2'                                          0.0000   \n",
       "'CREB3L1'                                        0.0000   \n",
       "'PNMA1'                                          0.0000   \n",
       "\n",
       "                               HNSCC20_P13_C05_S221_comb  \\\n",
       "processed by Maxima enzyme                        0.0000   \n",
       "Lymph node                                        1.0000   \n",
       "classified  as cancer cell                        1.0000   \n",
       "classified as non-cancer cells                    0.0000   \n",
       "non-cancer cell type                              0.0000   \n",
       "'C9orf152'                                        0.0000   \n",
       "'RPS11'                                           7.4281   \n",
       "'ELMO2'                                           0.0000   \n",
       "'CREB3L1'                                         0.0000   \n",
       "'PNMA1'                                           0.0000   \n",
       "\n",
       "                                HNSCC17_P4_C12_S132_comb  \\\n",
       "processed by Maxima enzyme                        0.0000   \n",
       "Lymph node                                        0.0000   \n",
       "classified  as cancer cell                        1.0000   \n",
       "classified as non-cancer cells                    0.0000   \n",
       "non-cancer cell type                              0.0000   \n",
       "'C9orf152'                                        0.0000   \n",
       "'RPS11'                                           6.8439   \n",
       "'ELMO2'                                           3.2863   \n",
       "'CREB3L1'                                         0.0000   \n",
       "'PNMA1'                                           0.0000   \n",
       "\n",
       "                               HNSCC20_P3_H08_S92_comb  \\\n",
       "processed by Maxima enzyme                      0.0000   \n",
       "Lymph node                                      0.0000   \n",
       "classified  as cancer cell                      1.0000   \n",
       "classified as non-cancer cells                  0.0000   \n",
       "non-cancer cell type                            0.0000   \n",
       "'C9orf152'                                      0.0000   \n",
       "'RPS11'                                         6.8676   \n",
       "'ELMO2'                                         0.0000   \n",
       "'CREB3L1'                                       0.0000   \n",
       "'PNMA1'                                         5.3462   \n",
       "\n",
       "                                HNSCC20_P3_G06_S78_comb  \n",
       "processed by Maxima enzyme                       0.0000  \n",
       "Lymph node                                       0.0000  \n",
       "classified  as cancer cell                       1.0000  \n",
       "classified as non-cancer cells                   0.0000  \n",
       "non-cancer cell type                             0.0000  \n",
       "'C9orf152'                                       0.0000  \n",
       "'RPS11'                                          6.3146  \n",
       "'ELMO2'                                          3.5905  \n",
       "'CREB3L1'                                        0.0000  \n",
       "'PNMA1'                                          0.0000  \n",
       "\n",
       "[10 rows x 5902 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SP_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5902"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cell_name_list = list(SP_df.columns)\n",
    "sample_cell_pair = [[c] + [c.split('_')[0]] for c in cell_name_list]\n",
    "\n",
    "len(cell_name_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cell_id</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_id</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>HN23</th>\n",
       "      <td>51</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN25</th>\n",
       "      <td>224</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN26</th>\n",
       "      <td>214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN28</th>\n",
       "      <td>355</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC</th>\n",
       "      <td>282</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC10</th>\n",
       "      <td>89</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC12</th>\n",
       "      <td>157</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC13</th>\n",
       "      <td>90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC16</th>\n",
       "      <td>596</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC17</th>\n",
       "      <td>490</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC18</th>\n",
       "      <td>567</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC20</th>\n",
       "      <td>691</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC22</th>\n",
       "      <td>201</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC24</th>\n",
       "      <td>130</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC25</th>\n",
       "      <td>501</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC26</th>\n",
       "      <td>214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC28</th>\n",
       "      <td>261</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC5</th>\n",
       "      <td>370</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC6</th>\n",
       "      <td>308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC7</th>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HNSCC8</th>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           cell_id\n",
       "sample_id         \n",
       "HN23            51\n",
       "HN25           224\n",
       "HN26           214\n",
       "HN28           355\n",
       "HNSCC          282\n",
       "HNSCC10         89\n",
       "HNSCC12        157\n",
       "HNSCC13         90\n",
       "HNSCC16        596\n",
       "HNSCC17        490\n",
       "HNSCC18        567\n",
       "HNSCC20        691\n",
       "HNSCC22        201\n",
       "HNSCC24        130\n",
       "HNSCC25        501\n",
       "HNSCC26        214\n",
       "HNSCC28        261\n",
       "HNSCC5         370\n",
       "HNSCC6         308\n",
       "HNSCC7           7\n",
       "HNSCC8         104"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SP_cell_meta_df = pd.DataFrame(sample_cell_pair, columns=['cell_id', 'sample_id'])\n",
    "SP_cell_meta_df = SP_cell_meta_df.set_index('cell_id')\n",
    "SP_cell_meta_df.reset_index().groupby('sample_id').count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "gene_list = list(SP_df.index[5:])\n",
    "SP_df2 = SP_df.loc[gene_list, cell_name_list]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>HN28_P15_D06_S330_comb</th>\n",
       "      <th>HN28_P6_G05_S173_comb</th>\n",
       "      <th>HN26_P14_D11_S239_comb</th>\n",
       "      <th>HN26_P14_H05_S281_comb</th>\n",
       "      <th>HN26_P25_H09_S189_comb</th>\n",
       "      <th>HN26_P14_H06_S282_comb</th>\n",
       "      <th>HN25_P25_C04_S316_comb</th>\n",
       "      <th>HN26_P25_A11_S107_comb</th>\n",
       "      <th>HN26_P25_C09_S129_comb</th>\n",
       "      <th>HNSCC26_P24_H05_S377_comb</th>\n",
       "      <th>...</th>\n",
       "      <th>HNSCC20_P3_B10_S22_comb</th>\n",
       "      <th>HNSCC20_P13_B11_S215_comb</th>\n",
       "      <th>HNSCC20_P3_C08_S32_comb</th>\n",
       "      <th>HNSCC17_P4_H03_S183_comb</th>\n",
       "      <th>HNSCC20_P3_F09_S69_comb</th>\n",
       "      <th>HNSCC17_P4_G12_S180_comb</th>\n",
       "      <th>HNSCC20_P13_C05_S221_comb</th>\n",
       "      <th>HNSCC17_P4_C12_S132_comb</th>\n",
       "      <th>HNSCC20_P3_H08_S92_comb</th>\n",
       "      <th>HNSCC20_P3_G06_S78_comb</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C9orf152</th>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.42761</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RPS11</th>\n",
       "      <td>6.0037</td>\n",
       "      <td>7.3006</td>\n",
       "      <td>7.28850</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>7.47420</td>\n",
       "      <td>6.9548</td>\n",
       "      <td>5.9743</td>\n",
       "      <td>6.4650</td>\n",
       "      <td>6.5628</td>\n",
       "      <td>7.9486</td>\n",
       "      <td>...</td>\n",
       "      <td>6.8747</td>\n",
       "      <td>5.96490</td>\n",
       "      <td>4.8252</td>\n",
       "      <td>7.5167</td>\n",
       "      <td>7.3249</td>\n",
       "      <td>5.3595</td>\n",
       "      <td>7.4281</td>\n",
       "      <td>6.8439</td>\n",
       "      <td>6.8676</td>\n",
       "      <td>6.3146</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ELMO2</th>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>5.2465</td>\n",
       "      <td>0.50487</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>3.4154</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>1.9613</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.62106</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>3.2863</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>3.5905</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CREB3L1</th>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PNMA1</th>\n",
       "      <td>5.1474</td>\n",
       "      <td>5.3329</td>\n",
       "      <td>2.83370</td>\n",
       "      <td>5.7507</td>\n",
       "      <td>0.19661</td>\n",
       "      <td>1.9949</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>2.1541</td>\n",
       "      <td>3.5815</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>...</td>\n",
       "      <td>3.0332</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>5.3462</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 5902 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          HN28_P15_D06_S330_comb  HN28_P6_G05_S173_comb  \\\n",
       "C9orf152                  0.0000                 0.0000   \n",
       "RPS11                     6.0037                 7.3006   \n",
       "ELMO2                     0.0000                 0.0000   \n",
       "CREB3L1                   0.0000                 0.0000   \n",
       "PNMA1                     5.1474                 5.3329   \n",
       "\n",
       "          HN26_P14_D11_S239_comb  HN26_P14_H05_S281_comb  \\\n",
       "C9orf152                 0.42761                  0.0000   \n",
       "RPS11                    7.28850                  0.0000   \n",
       "ELMO2                    0.00000                  5.2465   \n",
       "CREB3L1                  0.00000                  0.0000   \n",
       "PNMA1                    2.83370                  5.7507   \n",
       "\n",
       "          HN26_P25_H09_S189_comb  HN26_P14_H06_S282_comb  \\\n",
       "C9orf152                 0.00000                  0.0000   \n",
       "RPS11                    7.47420                  6.9548   \n",
       "ELMO2                    0.50487                  0.0000   \n",
       "CREB3L1                  0.00000                  0.0000   \n",
       "PNMA1                    0.19661                  1.9949   \n",
       "\n",
       "          HN25_P25_C04_S316_comb  HN26_P25_A11_S107_comb  \\\n",
       "C9orf152                  0.0000                  0.0000   \n",
       "RPS11                     5.9743                  6.4650   \n",
       "ELMO2                     0.0000                  3.4154   \n",
       "CREB3L1                   0.0000                  0.0000   \n",
       "PNMA1                     0.0000                  2.1541   \n",
       "\n",
       "          HN26_P25_C09_S129_comb  HNSCC26_P24_H05_S377_comb  ...  \\\n",
       "C9orf152                  0.0000                     0.0000  ...   \n",
       "RPS11                     6.5628                     7.9486  ...   \n",
       "ELMO2                     0.0000                     1.9613  ...   \n",
       "CREB3L1                   0.0000                     0.0000  ...   \n",
       "PNMA1                     3.5815                     0.0000  ...   \n",
       "\n",
       "          HNSCC20_P3_B10_S22_comb  HNSCC20_P13_B11_S215_comb  \\\n",
       "C9orf152                   0.0000                    0.00000   \n",
       "RPS11                      6.8747                    5.96490   \n",
       "ELMO2                      0.0000                    0.62106   \n",
       "CREB3L1                    0.0000                    0.00000   \n",
       "PNMA1                      3.0332                    0.00000   \n",
       "\n",
       "          HNSCC20_P3_C08_S32_comb  HNSCC17_P4_H03_S183_comb  \\\n",
       "C9orf152                   0.0000                    0.0000   \n",
       "RPS11                      4.8252                    7.5167   \n",
       "ELMO2                      0.0000                    0.0000   \n",
       "CREB3L1                    0.0000                    0.0000   \n",
       "PNMA1                      0.0000                    0.0000   \n",
       "\n",
       "          HNSCC20_P3_F09_S69_comb  HNSCC17_P4_G12_S180_comb  \\\n",
       "C9orf152                   0.0000                    0.0000   \n",
       "RPS11                      7.3249                    5.3595   \n",
       "ELMO2                      0.0000                    0.0000   \n",
       "CREB3L1                    0.0000                    0.0000   \n",
       "PNMA1                      0.0000                    0.0000   \n",
       "\n",
       "          HNSCC20_P13_C05_S221_comb  HNSCC17_P4_C12_S132_comb  \\\n",
       "C9orf152                     0.0000                    0.0000   \n",
       "RPS11                        7.4281                    6.8439   \n",
       "ELMO2                        0.0000                    3.2863   \n",
       "CREB3L1                      0.0000                    0.0000   \n",
       "PNMA1                        0.0000                    0.0000   \n",
       "\n",
       "          HNSCC20_P3_H08_S92_comb  HNSCC20_P3_G06_S78_comb  \n",
       "C9orf152                   0.0000                   0.0000  \n",
       "RPS11                      6.8676                   6.3146  \n",
       "ELMO2                      0.0000                   3.5905  \n",
       "CREB3L1                    0.0000                   0.0000  \n",
       "PNMA1                      5.3462                   0.0000  \n",
       "\n",
       "[5 rows x 5902 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SP_df2.index = [g.replace(\"'\", \"\") for g in SP_df2.index]\n",
    "SP_df2 = SP_df2.astype(float)\n",
    "SP_df2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "SP_tpm_df = (np.power(2, SP_df2) - 1) * 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((23686,), 23686)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SP_tpm_df.index.shape, len(set(SP_tpm_df.index))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Ankur S. et al."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Read TPM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "gene_df = pd.read_csv('../data/HN_patient_specific/gene.csv', index_col='gene_id')\n",
    "gene_id_name_dict = dict(zip(gene_df.index, gene_df['gene_name']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "AS_tpm_df = pd.read_csv('../data/HN_patient_specific/hn_sc_tpm.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>RHH2176</th>\n",
       "      <th>RHH2177</th>\n",
       "      <th>RHH2178</th>\n",
       "      <th>RHH2179</th>\n",
       "      <th>RHH2180</th>\n",
       "      <th>RHH2181</th>\n",
       "      <th>RHH2182</th>\n",
       "      <th>RHH2183</th>\n",
       "      <th>RHH2184</th>\n",
       "      <th>RHH2185</th>\n",
       "      <th>...</th>\n",
       "      <th>RHO707</th>\n",
       "      <th>RHO708</th>\n",
       "      <th>RHO709</th>\n",
       "      <th>RHO711</th>\n",
       "      <th>RHO712</th>\n",
       "      <th>RHO713</th>\n",
       "      <th>RHO714</th>\n",
       "      <th>RHO715</th>\n",
       "      <th>RHO716</th>\n",
       "      <th>RHO717</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>TSPAN6</th>\n",
       "      <td>49.14</td>\n",
       "      <td>48.85</td>\n",
       "      <td>131.23</td>\n",
       "      <td>178.24</td>\n",
       "      <td>23.44</td>\n",
       "      <td>153.69</td>\n",
       "      <td>9.38</td>\n",
       "      <td>96.24</td>\n",
       "      <td>81.05</td>\n",
       "      <td>10.09</td>\n",
       "      <td>...</td>\n",
       "      <td>46.83</td>\n",
       "      <td>127.02</td>\n",
       "      <td>121.74</td>\n",
       "      <td>101.40</td>\n",
       "      <td>27.24</td>\n",
       "      <td>72.80</td>\n",
       "      <td>121.54</td>\n",
       "      <td>131.68</td>\n",
       "      <td>18.73</td>\n",
       "      <td>45.72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DPM1</th>\n",
       "      <td>0.00</td>\n",
       "      <td>25.66</td>\n",
       "      <td>123.04</td>\n",
       "      <td>159.22</td>\n",
       "      <td>110.39</td>\n",
       "      <td>115.40</td>\n",
       "      <td>51.75</td>\n",
       "      <td>47.94</td>\n",
       "      <td>25.92</td>\n",
       "      <td>12.69</td>\n",
       "      <td>...</td>\n",
       "      <td>95.78</td>\n",
       "      <td>316.68</td>\n",
       "      <td>24.94</td>\n",
       "      <td>41.63</td>\n",
       "      <td>28.49</td>\n",
       "      <td>247.13</td>\n",
       "      <td>326.69</td>\n",
       "      <td>151.61</td>\n",
       "      <td>27.20</td>\n",
       "      <td>36.17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SCYL3</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>26.77</td>\n",
       "      <td>4.13</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>...</td>\n",
       "      <td>2.41</td>\n",
       "      <td>3.62</td>\n",
       "      <td>1.41</td>\n",
       "      <td>5.64</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.39</td>\n",
       "      <td>0.00</td>\n",
       "      <td>5.66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C1orf112</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>29.61</td>\n",
       "      <td>2.47</td>\n",
       "      <td>0.00</td>\n",
       "      <td>24.26</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>...</td>\n",
       "      <td>6.01</td>\n",
       "      <td>2.50</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.49</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.90</td>\n",
       "      <td>1.67</td>\n",
       "      <td>55.43</td>\n",
       "      <td>7.31</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CFH</th>\n",
       "      <td>0.00</td>\n",
       "      <td>7.30</td>\n",
       "      <td>75.75</td>\n",
       "      <td>0.00</td>\n",
       "      <td>32.28</td>\n",
       "      <td>0.00</td>\n",
       "      <td>4.91</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>...</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.32</td>\n",
       "      <td>1.62</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>7.88</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 1171 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          RHH2176  RHH2177  RHH2178  RHH2179  RHH2180  RHH2181  RHH2182  \\\n",
       "TSPAN6      49.14    48.85   131.23   178.24    23.44   153.69     9.38   \n",
       "DPM1         0.00    25.66   123.04   159.22   110.39   115.40    51.75   \n",
       "SCYL3        0.00     0.00    26.77     4.13     0.00     0.00     0.00   \n",
       "C1orf112     0.00     0.00     0.00    29.61     2.47     0.00    24.26   \n",
       "CFH          0.00     7.30    75.75     0.00    32.28     0.00     4.91   \n",
       "\n",
       "          RHH2183  RHH2184  RHH2185  ...  RHO707  RHO708  RHO709  RHO711  \\\n",
       "TSPAN6      96.24    81.05    10.09  ...   46.83  127.02  121.74  101.40   \n",
       "DPM1        47.94    25.92    12.69  ...   95.78  316.68   24.94   41.63   \n",
       "SCYL3        0.00     0.00     0.00  ...    2.41    3.62    1.41    5.64   \n",
       "C1orf112     0.00     0.00     0.00  ...    6.01    2.50    0.00    0.49   \n",
       "CFH          0.00     0.00     0.00  ...    0.00    0.00    0.00    0.00   \n",
       "\n",
       "          RHO712  RHO713  RHO714  RHO715  RHO716  RHO717  \n",
       "TSPAN6     27.24   72.80  121.54  131.68   18.73   45.72  \n",
       "DPM1       28.49  247.13  326.69  151.61   27.20   36.17  \n",
       "SCYL3       0.00    0.00    0.00    0.39    0.00    5.66  \n",
       "C1orf112    0.00    0.90    1.67   55.43    7.31    0.00  \n",
       "CFH         0.00    0.32    1.62    0.00    0.00    7.88  \n",
       "\n",
       "[5 rows x 1171 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "AS_tpm_df = AS_tpm_df.sort_index()\n",
    "AS_tpm_df.index = [gene_id_name_dict[g] for g in AS_tpm_df.index]\n",
    "AS_tpm_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((15185,), 15144)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "AS_tpm_df.index.shape, len(set(AS_tpm_df.index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15185, 1171)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "AS_tpm_df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For duplicated gene names, sum the TPM values of the duplicated rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((15144,), 15144)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "AS_tpm_df = AS_tpm_df.groupby(AS_tpm_df.index).sum()\n",
    "AS_tpm_df.index.shape, len(set(AS_tpm_df.index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1247, 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_id</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>HN120M</th>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN120P</th>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN137M</th>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN137P</th>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN137P2</th>\n",
       "      <td>192</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN148M</th>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN148P</th>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN159M</th>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN159P</th>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN160M</th>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN160P</th>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HN182M</th>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           index\n",
       "sample_id       \n",
       "HN120M        96\n",
       "HN120P        96\n",
       "HN137M        96\n",
       "HN137P        96\n",
       "HN137P2      192\n",
       "HN148M        96\n",
       "HN148P        96\n",
       "HN159M        96\n",
       "HN159P        96\n",
       "HN160M        95\n",
       "HN160P        96\n",
       "HN182M        96"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "AS_cell_meta_df = pd.read_csv('../preprocessed_data/HN_patient_specific/cell_info.csv', index_col=0)[['cell_line_id']]\n",
    "AS_cell_meta_df.columns = ['sample_id']\n",
    "\n",
    "print (AS_cell_meta_df.shape)\n",
    "AS_cell_meta_df.reset_index().groupby('sample_id').count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###### Read mat_norm and preprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>RHH2176</th>\n",
       "      <th>RHH2177</th>\n",
       "      <th>RHH2178</th>\n",
       "      <th>RHH2179</th>\n",
       "      <th>RHH2180</th>\n",
       "      <th>RHH2181</th>\n",
       "      <th>RHH2182</th>\n",
       "      <th>RHH2183</th>\n",
       "      <th>RHH2184</th>\n",
       "      <th>RHH2185</th>\n",
       "      <th>...</th>\n",
       "      <th>RHO707</th>\n",
       "      <th>RHO708</th>\n",
       "      <th>RHO709</th>\n",
       "      <th>RHO711</th>\n",
       "      <th>RHO712</th>\n",
       "      <th>RHO713</th>\n",
       "      <th>RHO714</th>\n",
       "      <th>RHO715</th>\n",
       "      <th>RHO716</th>\n",
       "      <th>RHO717</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A2ML1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.055681</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAAS</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.433299</td>\n",
       "      <td>0.818982</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.165648</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.731827</td>\n",
       "      <td>0.104433</td>\n",
       "      <td>0.956780</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.157415</td>\n",
       "      <td>0.240023</td>\n",
       "      <td>0.052505</td>\n",
       "      <td>0.01466</td>\n",
       "      <td>0.141820</td>\n",
       "      <td>0.078456</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AACS</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.022271</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.010963</td>\n",
       "      <td>0.157415</td>\n",
       "      <td>0.460138</td>\n",
       "      <td>0.035308</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.073422</td>\n",
       "      <td>0.208026</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.061757</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AADAC</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.740419</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.548487</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.370797</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AADAT</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.764792</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.789582</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 1171 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       RHH2176   RHH2177   RHH2178  RHH2179   RHH2180   RHH2181  RHH2182  \\\n",
       "A2ML1      0.0  0.000000  0.000000      0.0  0.000000  0.000000      0.0   \n",
       "AAAS       0.0  0.433299  0.818982      0.0  0.165648  0.000000      0.0   \n",
       "AACS       0.0  0.000000  0.000000      0.0  0.000000  0.022271      0.0   \n",
       "AADAC      0.0  0.000000  0.740419      0.0  0.000000  0.000000      0.0   \n",
       "AADAT      0.0  0.000000  0.000000      0.0  0.000000  0.764792      0.0   \n",
       "\n",
       "        RHH2183   RHH2184   RHH2185  ...  RHO707    RHO708    RHO709  \\\n",
       "A2ML1  0.000000  0.000000  0.055681  ...     0.0  0.000000  0.000000   \n",
       "AAAS   0.731827  0.104433  0.956780  ...     0.0  0.000000  0.157415   \n",
       "AACS   0.000000  0.000000  0.000000  ...     0.0  0.010963  0.157415   \n",
       "AADAC  0.000000  0.000000  0.000000  ...     0.0  0.000000  0.548487   \n",
       "AADAT  0.000000  0.000000  0.789582  ...     0.0  0.000000  0.000000   \n",
       "\n",
       "         RHO711    RHO712   RHO713    RHO714    RHO715  RHO716    RHO717  \n",
       "A2ML1  0.000000  0.000000  0.00000  0.000000  0.000000     0.0  0.000000  \n",
       "AAAS   0.240023  0.052505  0.01466  0.141820  0.078456     0.0  0.000000  \n",
       "AACS   0.460138  0.035308  0.00000  0.073422  0.208026     0.0  0.061757  \n",
       "AADAC  0.000000  0.000000  0.00000  0.000000  0.370797     0.0  0.000000  \n",
       "AADAT  0.000000  0.000000  0.00000  0.000000  0.000000     0.0  0.000000  \n",
       "\n",
       "[5 rows x 1171 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mat_norm_df = pd.read_csv('../data/HN_patient_specific/hn_mat_norm.csv', index_col=0)\n",
    "mat_norm_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Save TPM and cell info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "out_dir = '../preprocessed_data/scRNAseq/'\n",
    "\n",
    "# For every dataset write the linear TPM matrix, its log2(TPM+1) transform\n",
    "# and the cell metadata table, all under a common '<dataset>_' file prefix.\n",
    "for ds_name, tpm_df, meta_df in [('AS_et_al', AS_tpm_df, AS_cell_meta_df),\n",
    "                                 ('AP_et_al', AP_tpm_df, AP_cell_meta_df),\n",
    "                                 ('SP_et_al', SP_tpm_df, SP_cell_meta_df)]:\n",
    "    tpm_df.to_csv(out_dir + ds_name + '_tpm.csv')\n",
    "    np.log2(tpm_df + 1).to_csv(out_dir + ds_name + '_log2tpm.csv')\n",
    "    meta_df.to_csv(out_dir + ds_name + '_cell_metadata.csv')\n",
    "\n",
    "# Save the regenerated mat_norm\n",
    "mat_norm_df.to_csv(out_dir + 'AS_et_al_mat_norm.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sum up TPM per sample to obtain bulk profile \n",
    "\n",
    "```Old note``` We used log2(TPM+1) for the scRNA-seq HN dataset and the other scRNA-seq datasets, while TCGA FPKM is not in log2. This is not consistent, but we kept it so that the previous analyses did not have to be changed.\n",
    "\n",
    "```Dec 2019``` Since everything has to be rerun for scRNA-seq anyway, we now use TPM instead of log2 TPM."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse single-cell expression into one profile per sample (sum and mean\n",
    "# over the cells of each sample_id), in both log2(TPM+1) and linear TPM scale.\n",
    "for ds_name, tpm_df, meta_df in zip(['AS_et_al', 'AP_et_al', 'SP_et_al'],\n",
    "                                    [AS_tpm_df, AP_tpm_df, SP_tpm_df],\n",
    "                                    [AS_cell_meta_df, AP_cell_meta_df, SP_cell_meta_df]):\n",
    "\n",
    "    # The log2 matrix is built per dataset, so only one is alive at a time.\n",
    "    for scale, exp_df in [('log2tpm', np.log2(tpm_df+1)), ('tpm', tpm_df)]:\n",
    "\n",
    "        # Attach sample_id to every cell (rows after transposing) once and reuse\n",
    "        # the grouping for both aggregates instead of merging twice.\n",
    "        # pd.merge defaults to an inner join, so cells without a row in meta_df are dropped.\n",
    "        grouped = pd.merge(exp_df.T, meta_df, left_index=True, right_index=True).groupby('sample_id')\n",
    "\n",
    "        sum_df = grouped.sum()\n",
    "        sum_df.T.to_csv(out_dir + ds_name + '_sample_sum_' + scale + '.tsv', sep='\\t')\n",
    "\n",
    "        avg_df = grouped.mean()\n",
    "        avg_df.T.to_csv(out_dir + ds_name + '_sample_avg_' + scale + '.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "code_folding": [
     0
    ]
   },
   "outputs": [],
   "source": [
    "### TEMP: split SP for CIBERSORT ###\n",
    "# df = pd.read_csv(out_dir + \"SP_et_al_sample_avg_log2tpm.tsv\", sep='\\t', index_col=0)\n",
    "\n",
    "# cl_list = df.columns\n",
    "# n_size = 7\n",
    "\n",
    "# idx = 0\n",
    "# for i in range(3):\n",
    "#     from_idx = idx+(n_size*i)\n",
    "#     to_idx = idx+(n_size*(i+1))\n",
    "#     selected_cl_list = cl_list[from_idx: to_idx]\n",
    "#     print (selected_cl_list)\n",
    "    \n",
    "#     selected_df = df[selected_cl_list]\n",
    "#     selected_df.to_csv(out_dir + \"SP_et_al_sample_avg_log2tpm_{}of3.tsv\".format(i+1), sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-sample sum, mean and 95th percentile of the mat_norm values.\n",
    "for ds_name, exp_df, meta_df in zip(['AS_et_al'],\n",
    "                                    [mat_norm_df],\n",
    "                                    [AS_cell_meta_df]):\n",
    "\n",
    "    # Use the loop variable exp_df (not mat_norm_df directly) so that adding another\n",
    "    # dataset to the lists above aggregates that dataset's own matrix.\n",
    "    # The merge attaches sample_id to every cell; it is done once and the grouping reused.\n",
    "    grouped = pd.merge(exp_df.T, meta_df, left_index=True, right_index=True).groupby('sample_id')\n",
    "\n",
    "    sum_df = grouped.sum()\n",
    "    sum_df.T.to_csv(out_dir + ds_name + '_sample_sum_mat_norm.tsv', sep='\\t')\n",
    "\n",
    "    avg_df = grouped.mean()\n",
    "    avg_df.T.to_csv(out_dir + ds_name + '_sample_avg_mat_norm.tsv', sep='\\t')\n",
    "\n",
    "    q95_df = grouped.quantile(.95)\n",
    "    q95_df.T.to_csv(out_dir + ds_name + '_sample_q95_mat_norm.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "165px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
